Benchmark using Max Quant HER2 data and comparing the result with MaxQuant Discovery workflow

mq_data_input <- "../../../data-s3/data-formats/maxquant/LFQ/HER2/data/"
protein.intensities <- read.csv(file.path(mq_data_input, "proteinGroups.txt"), sep = "\t", stringsAsFactors = F)
design <- read.csv(file.path(mq_data_input, "experimentDesign_original.txt"), sep = "\t", stringsAsFactors = F)

input_gen <- MaxQuantTranform(proteinGroups=protein.intensities, design = design, species = "human", useNormalisationMethod = "None", labellingMethod = "LFQ")
## Joining, by = "mqExperiment"
data_int <- input_gen$intensities
data_int[data_int$ProteinId == "Q9Y2F5",]
##      LFQ.intensity.1_hu_C1 LFQ.intensity.2_hu_P1 LFQ.intensity.3_hu_C2
## 1804             145650000                     0             271210000
##      LFQ.intensity.4_hu_P2 LFQ.intensity.5_hu_C3 LFQ.intensity.6_hu_P3
## 1804                     0             275220000                     0
##      ProteinId
## 1804    Q9Y2F5
protein.intensities[grep("Q9Y2F5", protein.intensities$Protein.IDs),] %>% select(starts_with("LFQ.intensity"))
##      LFQ.intensity.1_hu_C1 LFQ.intensity.2_hu_P1 LFQ.intensity.3_hu_C2
## 1804             145650000                     0             271210000
##      LFQ.intensity.4_hu_P2 LFQ.intensity.5_hu_C3 LFQ.intensity.6_hu_P3
## 1804                     0             275220000                     0

Load MaxQuant discovery

output_folder <- "../../../data-s3/data-formats/maxquant/LFQ/HER2/data/output/"
library(drake)
## 
## Attaching package: 'drake'
## The following object is masked from 'package:plotly':
## 
##     config
## The following objects are masked from 'package:tidyr':
## 
##     expand, gather
## The following object is masked from 'package:S4Vectors':
## 
##     expand
# library(LFQprocessing)
# her2 <- protein_quant_runner(upload_folder, output_folder)

out_data <- read.delim(file.path(output_folder, "proteinGroups_quant.txt")) %>%
  dplyr::rename(Majority.protein.IDs=majority.protein.ids,
                Fasta.headers=fasta.headers)
out_data_1prot <- assign_uniprot_ids_mq(out_data)
out_data_bench <- out_data_1prot %>% dplyr::select("ProteinId",
                                             starts_with("logFC."), 
                                             starts_with("adj.P.Val."), 
                                             starts_with("P.Value.")) 
colnames(out_data_bench)[2:ncol(out_data_bench)] <- paste0("Disco_", colnames(out_data_bench)[2:ncol(out_data_bench)])


out_data_1prot[out_data_1prot$ProteinId == "Q9Y2F5",] %>% select(starts_with("LFQ.intensity"))
##      lfq.intensity.1_hu_c1 lfq.intensity.3_hu_c2 lfq.intensity.5_hu_c3
## 1774             145650000             271210000             275220000
##      lfq.intensity.2_hu_p1 lfq.intensity.4_hu_p2 lfq.intensity.6_hu_p3
## 1774                     0                     0                     0

Compare results

load("../tests/data/mq_lfq_output.Rdata")

expected_complete <- rowData(expectedCompleteIntensityExperiment)
data <- assay(expectedIntensityExperiment)

data_int[data_int$ProteinId == "Q9Y2F5",]
##      LFQ.intensity.1_hu_C1 LFQ.intensity.2_hu_P1 LFQ.intensity.3_hu_C2
## 1804             145650000                     0             271210000
##      LFQ.intensity.4_hu_P2 LFQ.intensity.5_hu_C3 LFQ.intensity.6_hu_P3
## 1804                     0             275220000                     0
##      ProteinId
## 1804    Q9Y2F5
sum(!(out_data_bench$ProteinId %in% expected_complete$ProteinId))
## [1] 0
sum(!(expected_complete$ProteinId %in% out_data_bench$ProteinId))
## [1] 0
compare <- out_data_bench %>% left_join(as_tibble(expected_complete))
## Joining, by = "ProteinId"
p=ggplot(compare, aes(x=-log10(`P.Value.AZD8931_resistant_SKBR3_AZDRc.Parental_SKBR3`), 
                    y = -log10(`Disco_P.Value.AZD8931_resistant_SKBR3_AZDRc...Parental_SKBR3`), label=ProteinId)) + 
  geom_point() +
  theme_bw() 

ggplotly(p)
ggplot(compare, aes(x=`logFC.AZD8931_resistant_SKBR3_AZDRc.Parental_SKBR3`, 
                    y = `Disco_logFC.AZD8931_resistant_SKBR3_AZDRc...Parental_SKBR3`)) + 
  geom_point() +
  theme_bw() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 180 rows containing non-finite values (stat_smooth).
## Warning: Removed 180 rows containing missing values (geom_point).

Compare results 2

Run MassExpression discovery

intensities <- mq_lfq_data$intensities
design <- mq_lfq_data$design
parameters <- mq_lfq_data$parameters
normalisation_method <- parameters[parameters[,1] == "UseNormalisationMethod",2]
species <- parameters[parameters[,1] == "Species",2]
labellingMethod <- parameters[parameters[,1] == "LabellingMethod",2]


results <- runGenericDiscovery(experimentDesign = design, 
                               proteinIntensities = intensities, 
                               normalisationMethod = normalisation_method, 
                               species = species, 
                               labellingMethod = labellingMethod)
## Joining, by = "SampleName"
completeExperiment <- results$CompleteIntensityExperiment
expected_complete_new <- rowData(completeExperiment)

sum(!(out_data_bench$ProteinId %in% expected_complete_new$ProteinId))
## [1] 0
sum(!(expected_complete_new$ProteinId %in% out_data_bench$ProteinId))
## [1] 0
compare <- out_data_bench %>% left_join(as_tibble(expected_complete_new))
## Joining, by = "ProteinId"
ggplot(compare, aes(x=`logFC.AZD8931_resistant_SKBR3_AZDRc.Parental_SKBR3`, 
                    y = `Disco_logFC.AZD8931_resistant_SKBR3_AZDRc...Parental_SKBR3`)) + 
  geom_point() +
  theme_bw() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 180 rows containing non-finite values (stat_smooth).
## Warning: Removed 180 rows containing missing values (geom_point).

Two runs of mass expression

colnames(expected_complete)[2:ncol(expected_complete)] <- paste0("saved_",colnames(expected_complete)[2:ncol(expected_complete)])

colnames(expected_complete_new)[2:ncol(expected_complete_new)] <- paste0("new_",colnames(expected_complete_new)[2:ncol(expected_complete_new)])

compare_me <- as_tibble(expected_complete) %>% left_join(as_tibble(expected_complete_new))
## Joining, by = "ProteinId"
ggplot(compare_me, aes(x=`saved_logFC.AZD8931_resistant_SKBR3_AZDRc.Parental_SKBR3`, 
                    y = `new_logFC.AZD8931_resistant_SKBR3_AZDRc.Parental_SKBR3`)) + 
  geom_point() +
  theme_bw() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 180 rows containing non-finite values (stat_smooth).
## Warning: Removed 180 rows containing missing values (geom_point).

Drake

library(visNetwork)
plan <- drake_plan(runGenericDiscovery(experimentDesign = design, 
                               proteinIntensities = intensities, 
                               normalisationMethod = normalisation_method, 
                               species = species, 
                               labellingMethod = labellingMethod))

vis_drake_graph(plan)
design <- mq_lfq_data$design
intensities <- mq_lfq_data$intensities
parameters <- mq_lfq_data$parameters
normalisation_method <- parameters[parameters[,1] == "UseNormalisationMethod",2]
species <- parameters[parameters[,1] == "Species",2]
labellingMethod <- parameters[parameters[,1] == "LabellingMethod",2]

listMetadata <- list(Species = species, 
                     LabellingMethod = labellingMethod, 
                     NormalisationAppliedToAssay = "None")

# Create Data Rep
IntensityExperiment <- constructSummarizedExperiment(experimentDesign = design, 
                                                     proteinIntensities = intensities, 
                                                     listMetadata = listMetadata)

normalisationMethod <- "None"
plan_limma <- drake_plan(
    longIntensityDT <- initialiseLongIntensityDT(IntensityExperiment),
  
  # Create Median Normalized Measurements in each Condition/Replicate
  longIntensityDT <- normaliseIntensity(longIntensityDT=longIntensityDT,
                                        normalisationMethod=normalisationMethod),
  
  # Imputation
  longIntensityDT <- imputeLFQ(longIntensityDT, 
                         id_type = "ProteinId", 
                         int_type = "log2NIntNorm",
                         f_imputeStDev = 0.3,
                         f_imputePosition= 1.8),
  
  
  # RunId will be unique to a row wherease replicate may not
  longIntensityDT[, RunId := str_c(Condition, Replicate, sep = ".")],
  
  # Run LIMMA
  resultsQuant <- limmaStatsFun(ID_type = "ProteinId",
                                   int_type = "log2NIntNorm",
                                   condition_col_name = "Condition",
                                   run_id_col_name = "RunId",
                                   rep_col_name = "Replicate",
                                   funDT = longIntensityDT),
  stats = resultsQuant[["stats"]],
  conditionComparisonMapping = resultsQuant[["conditionComparisonMapping"]],
  
  # SummarizedExperiment which contains the complete essay with imputed data and statistics
  # about the number of proteins imputed in each condition 
  CompleteIntensityExperiment <- createCompleteIntensityExperiment(IntensityExperiment,
                                                                   limmaStats=stats,
                                                                   normalisationAppliedToAssay = normalisationMethod,
                                                                   longIntensityDT = longIntensityDT,
                                                                   conditionComparisonMapping = conditionComparisonMapping)
  
)
## Warning: Use `=` instead of `<-` or `->` to assign targets to commands in `drake_plan()`. For example, write `drake_plan(a = 1)` instead of `drake_plan(a <- 1)`. Arrows were used to declare these commands:
##   longIntensityDT <- initialiseLongIntensityDT(IntensityExperiment)
##   longIntensityDT <- normaliseIntensity(longIntensityDT = longIntensityDT, 
##     normalisationMethod = normalisationMethod)
##   longIntensityDT <- imputeLFQ(longIntensityDT, id_type = "ProteinId", 
##     int_type = "log2NIntNorm", f_imputeStDev = 0.3, f_imputePosition = 1.8)
##   resultsQuant <- limmaStatsFun(ID_type = "ProteinId", int_type = "log2NIntNorm", 
##     condition_col_name = "Condition", run_id_col_name = "RunId", 
##     rep_col_name = "Replicate", funDT = longIntensityDT)
##   CompleteIntensityExperiment <- createCompleteIntensityExperiment(IntensityExperiment, 
##     limmaStats = stats, normalisationAppliedToAssay = normalisationMethod, 
##     longIntensityDT = longIntensityDT, conditionComparisonMapping = conditionComparisonMapping)
vis_drake_graph(plan_limma)